This notebook will contain the codes and documentations for spam analysis. A report is generated on the analysis in the dropbox paper https://paper.dropbox.com/doc/Spam-Analysis-of-Maya-questions-NubDXwEKR6NDOBghYGgQ4.
First we will connect to our database.
In [1]:
from database import Database
database = Database(
'<host name>',
'<database name>',
'<user name>',
'<password>',
'utf8mb4'
)
In [ ]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
if connection:
try:
with connection.cursor() as cursor:
query = "select count(*) as count, source from questions where status='spam' GROUP BY source,user_id is not null"
cursor.execute(query)
data = cursor.fetchall()
df2 = pd.DataFrame(data)
df2.plot.bar()
plt.show()
print data
finally:
connection.close()
In [4]:
def create_spam_type_table(cursor, connection):
create_schema_sql = "CREATE TABLE spam_type(id int(11) unsigned NOT NULL AUTO_INCREMENT,question_id int(10) unsigned NOT NULL,type enum('repeat','abusive','random','greeting','irrelevant','test') DEFAULT NULL,PRIMARY KEY (id),KEY question_id (question_id),CONSTRAINT spam_type_ibfk_1 FOREIGN KEY (question_id) REFERENCES questions (id)) ENGINE=InnoDB DEFAULT CHARSET=utf8"
cursor.execute(create_schema_sql)
connection.commit()
def create_spam_repeat_table(cursor, connection):
create_schema_sql = "CREATE TABLE spam_by_repeat(id int(11) unsigned NOT NULL AUTO_INCREMENT,question_id int(10) unsigned NOT NULL,parent_id int(10) unsigned NOT NULL,is_same_user tinyint(1) NOT NULL,time_dif int(11) unsigned NOT NULL,PRIMARY KEY (id),KEY question_id (question_id),KEY parent_id (parent_id),CONSTRAINT spam_by_repeat_ibfk_1 FOREIGN KEY (question_id) REFERENCES questions (id),CONSTRAINT spam_by_repeat_ibfk_2 FOREIGN KEY (parent_id) REFERENCES questions (id)) ENGINE=InnoDB DEFAULT CHARSET=utf8"
cursor.execute(create_schema_sql)
connection.commit()
In [ ]:
from pyxdameraulevenshtein import damerau_levenshtein_distance as dl_distance
def check_for_greeting(sentence):
greeting_word = ['hi', 'hey', 'hello', 'bye', 'thank', 'কেমন'.decode('utf-8')]
if len(sentence.split(' ')) < 10:
# greetings
for words in sentence.split(' '):
for i in greeting_word:
if dl_distance(i, words) <= 1:
sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','greeting')"
cursor.execute(sql)
connection.commit()
return True
return False
In [ ]:
def check_for_test(sentence):
testing_word = ['test', 'check', 'testing', 'checking']
if len(sentence.split(' ')) < 10:
# greetings
for words in sentence.split(' '):
for i in testing_word:
if dl_distance(i, words) <= 1 or i in sentence:
sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','test')"
cursor.execute(sql)
connection.commit()
return True
return False
In [ ]:
def mark_as_random(record):
sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','random')"
cursor.execute(sql)
connection.commit()
In [ ]:
def check_for_irrelevant(sentence):
irrelevant_word = ['voice']
if len(sentence.split(' ')) < 10:
# greetings
for words in sentence.split(' '):
for i in irrelevant_word:
if dl_distance(i, words) <= 1:
sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','irrelevant')"
cursor.execute(sql)
connection.commit()
return True
return False
In [ ]:
def check_for_abusive(sentence):
abusive_word = ['sex','সেক্স'.decode('utf-8'),'যৌন'.decode('utf-8'),'দুধ'.decode('utf-8'),'চুদ'.decode('utf-8'), 'লিঙ্গ'.decode('utf-8')]
# greetings
for words in sentence.split(' '):
for i in abusive_word:
if dl_distance(i, words) <= 1 or i in sentence:
sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','abusive')"
cursor.execute(sql)
connection.commit()
return True
return False
In [ ]:
def check_for_repeat(record):
sql = "SELECT id, email, created_at FROM questions WHERE id < " + str(record['id']) + " and body='" + record['body'] + "'"
cursor.execute(sql)
result = cursor.fetchall()
if result:
match = 0
for i in result:
if i['id'] > match:
match = i['id']
data = i
sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','repeat')"
cursor.execute(sql)
connection.commit()
if (data['email'] == record['email']):
sql = "INSERT INTO spam_by_repeat(question_id, parent_id, is_same_user, time_dif) VALUES('" + str(record['id']) + "','" + str(data['id']) + "','1','" + str(abs((record['created_at'] - data['created_at']).total_seconds())) + "')"
else:
sql = "INSERT INTO spam_by_repeat(question_id, parent_id, is_same_user, time_dif) VALUES('" + str(record['id']) + "','" + str(data['id']) + "','0','" + str(abs((record['created_at'] - data['created_at']).total_seconds())) + "')"
cursor.execute(sql)
connection.commit()
return True
return False
In [5]:
connection = database.connect_with_pymysql()
if connection:
try:
with connection.cursor() as cursor:
create_spam_type_table(cursor, connection)
create_spam_repeat_table(cursor, connection)
sql = "SELECT id, body, email, source, created_at FROM questions WHERE status='spam'"
cursor.execute(sql)
data = cursor.fetchall()
for record in data:
if record['body']:
# no vowel
if check_for_greeting(record['body']):
continue
elif check_for_test(record['body']):
continue
elif check_for_irrelevant(record['body']):
continue
elif len(record['body'].split(' ')) <= 3:
mark_as_random(record)
continue
elif check_for_repeat(record):
continue
elif check_for_abusive(record['body']):
continue
sql = "INSERT INTO spam_type(question_id) VALUES('" + str(record['id']) + "')"
cursor.execute(sql)
connection.commit()
else:
# random meaningless characters or blank message
mark_as_random(record)
finally:
connection.close()
In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
timeline = []
connection = database.connect_with_pymysql()
if connection:
try:
with connection.cursor() as cursor:
sql = "SELECT time_dif FROM spam_by_repeat where is_same_user=0"
cursor.execute(sql)
result = cursor.fetchall()
finally:
connection.close()
for i in result:
timeline.append(i['time_dif']/60)
plt.style.use('ggplot')
ranges = [0, 2, 60, 1440, 10080, sys.maxint]
col = ['<2min', '2-60min', '1-24hr', '1-7day', '>1week']
val = np.zeros(5)
for i in range(len(ranges)-1):
for j in timeline:
if ranges[i] <= j < ranges[i+1]:
val[i] += 1
df2 = pd.DataFrame(np.array(val), col, columns=['Count of repeated questions by time by different user'])
ax = df2.plot.bar()
for p in ax.patches:
b=p.get_bbox()
ax.annotate("{}".format(int(b.y1 + b.y0)), ((b.x0 + b.x1)/2 - 0.1, b.y1))
plt.show()